import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import sweetviz as sv
import seaborn as sns
import warnings
# Silence every warning for the rest of the session so library noise does not
# clutter the notebook output.
warnings.filterwarnings('ignore')

# Loading the dataset
# (full raw CSV, kept for reference; the sample below is what is actually used)
# df = pd.read_csv(r"..\data\complaints.csv")

# Loading my 300k sample data
df = pd.read_parquet("../data/processed/cfpb_sample_300k.parquet")

# Quick sanity check on the size of what was loaded.
print(f"Loaded {len(df):,} rows & {len(df.columns)} columns")
df.head()

| | Date received | Product | Sub-product | Issue | Sub-issue | Consumer complaint narrative | Company public response | Company | State | ZIP code | ... | Date sent to company | Company response to consumer | Timely response? | Consumer disputed? | Complaint ID | year_quarter | geo | region | stratum | sample_n |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2012-03-14 | Bank account or service | Checking account | Making/receiving payments, sending money | None | None | None | BANK OF AMERICA, NATIONAL ASSOCIATION | ND | 58503 | ... | 2012-03-15 | Closed with relief | Yes | No | 35052 | 2012Q1 | ND | Midwest | Bank account or service\|2012Q1\|Midwest | 4 |
| 1 | 2012-03-20 | Bank account or service | Checking account | Problems caused by my funds being low | None | None | None | TCF NATIONAL BANK | MN | 55125 | ... | 2012-03-21 | Closed with relief | Yes | No | 37573 | 2012Q1 | MN | Midwest | Bank account or service\|2012Q1\|Midwest | 4 |
| 2 | 2012-03-22 | Bank account or service | Checking account | Making/receiving payments, sending money | None | None | None | WELLS FARGO & COMPANY | MN | 55110 | ... | 2012-03-23 | Closed without relief | Yes | Yes | 39793 | 2012Q1 | MN | Midwest | Bank account or service\|2012Q1\|Midwest | 4 |
| 3 | 2012-03-07 | Bank account or service | Checking account | Making/receiving payments, sending money | None | None | None | Synovus Bank | OH | 44108 | ... | 2012-03-16 | Closed without relief | Yes | No | 34571 | 2012Q1 | OH | Midwest | Bank account or service\|2012Q1\|Midwest | 4 |
| 4 | 2012-03-20 | Bank account or service | Checking account | Problems caused by my funds being low | None | None | None | PNC Bank N.A. | PA | 18944 | ... | 2012-03-23 | Closed without relief | Yes | Yes | 37047 | 2012Q1 | PA | Northeast | Bank account or service\|2012Q1\|Northeast | 6 |
5 rows × 23 columns
# Build the YData Profiling report over the whole sample DataFrame.
profile = ProfileReport(
    df,
    title="CFPB 300k Sample - YData Profiling Report",
    explorative=True,  # richer, but still reasonable runtime
)

# Show the report inline in the notebook.
profile.to_notebook_iframe()
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Save to HTML
profile.to_file("../reports/cfpb_300k_profile.html")
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]
# South vs rest (your >50% sample focus)
# Split the sample on the `region` column; .copy() gives each side its own
# independent DataFrame rather than a view of `df`.
south_df = df[df['region'] == 'South'].copy()
other_df = df[df['region'] != 'South'].copy()

# Generate comparison report
# Each argument is a [DataFrame, label] pair; the labels name the two sides.
sweet_report = sv.compare([south_df, "South"], [other_df, "Others"])
| | [ 0%] 00:00 -> (? left)
# Notebook iframe
# Render the comparison report inline, scaled down slightly to fit the cell.
sweet_report.show_notebook(scale=0.9)

# Also write the same report out as a standalone HTML file.
sweet_report.show_html('../reports/south_vs_others_sweetviz.html')